####################################### LINEAR REGRESSION #######################################
options(show.signif.stars=FALSE, digit=4)

###################### Roller Data Example ######################
roller<-read.table("roller.txt",header=TRUE) 
head(roller, n=3)
roller; attach(roller)                                
y = depression; bar_x = mean(x)                                   
x = weight    ; bar_y = mean(y)                                   
detach(roller)
n = length(x)
s_xx = sum((x-bar_x)^2)                        # sum of squares of the predictor
	s_xx; sd(x)^2*(n-1)                            # check the formula with 'sd'
s_xy = sum((x-bar_x)*(y-bar_y))                # sum of cross products
	s_xy; cov(y,x)*(n-1)                           # check the formula with 'cov'
hat.beta_1 = s_xy/s_xx                         # least square estimate of slope
hat.beta_0 = bar_y - hat.beta_1 * bar_x        # least square estimate of intercept
	c(intercept=hat.beta_0,slope=hat.beta_1)      

plot(x,y,xlab="",ylab="",main="")             # scatterplot
abline(a=hat.beta_0,b=hat.beta_1)             # add the regression line
	title(xlab=list("Roller weight (t)",cex=1), ylab=list("Depression in lawn (mm)",cex=1))
	mtext("Depression vs weight, roller data",side=3,line=1,cex=1.2)
                                           
###################### R function 'lm()'  ######################
homedata<-read.table("homedata.txt",header=TRUE)
head(homedata, n=3)
attach(homedata)
	plot(y1970,y2000)
	cor(y1970,y2000)                        # as suggested by the scatterplot, there is a linear (and positive) relation 
res = lm(y2000~y1970)                         # IMP - linear models - 'response~predictor'
	coef(res)
	names(res)
	res$coefficients 
plot(y1970,y2000); abline(res, col=2, lwd=2) 

x.star = 50000                                # for a given x value
	coef(res)[1] + coef(res)[2]*x.star            # [b0 + b1 * x]
	t(coef(res))%*%c(1,x.star)                    # same using matrix algebra
	predict(res,data.frame(y1970=x.star))         

resid(res)                                    # epsilon_i 
y2000-fitted(res)                             # Y - Y_cappuccio
predict(res,data.frame(y1970=55100))          # perform a prediction as above with x=55100

lm(y2000~y1970, data=homedata)                # the use of model formula offers the flexibility
                                              # to specify the variable names of the data frame
                                              # without attaching the data frame

###################### Outliers in the regression model - REGULARITIES IN THE VOTE ######################
florida<-read.table("florida.txt",header=TRUE)
head(florida, n=3) 
attach(florida); str(florida); head(florida,n=3) 
# candidates were Pat Buchanan and George Bush: is there some relationship between # votes for the two candidates, since both conservatives candidates?

plot(BUCHANAN~BUSH, data=florida)        
res = lm(BUCHANAN~BUSH, data=florida)        
abline(res)        
with(florida, identify(BUSH,BUCHANAN,n=2, labels=County))        
	# identify points by mouse over them. It gives (row indexes). 
	# 'n=2' specify the number of points to identify. 
	# 'labels=County' is optional and allows for the placement of other text

# ora identifichiamo i due outliers  
County[c(13,50)]
class(County)
ycapp = predict(res,data.frame(BUSH=BUSH[c(13,50)]))
y = BUCHANAN[c(13,50)]
residui = y - ycapp
	residuals(res)[c(13,50)]    # predicted amount and residual for Dade and Palm Beach

######################  Simulated data ######################
age = rep(seq(20,60,by=5), 3)                 # In sport activity heart rates are used for optimal training
mhr = 209-0.7*age+rnorm(length(age),sd=4)     
plot(mhr~age, main="Age versus maximum heart rate")    
res.mhr = lm(mhr~age); coef(res.mhr); abline(res.mhr,lty=2)   # regression line 
	c(true_intercept=209, true_slope=-0.7)      
	abline(a=209,b=-0.7)                          # actual model
	legend(40,195,bty="n",legend=c("Actual model", "Estimated regression line"),lty=c(1,2))
       
RSS1 = sum(resid(res.mhr)^2)/(length(age)-2)         # RSS=Residual Sum of Squares
RSS2 = deviance(res.mhr)/(length(age)-2) 

#Testing model assumption
	help(plot.lm); plot(res.mhr,which=4)
	old.par = par(no.readonly = TRUE)            
	par(mfrow=c(2,2))
	plot(res.mhr,which=1:4)

par(old.par)                             
cooks.distance(res.mhr)
	summary(res.mhr)$resid
plot(res.mhr, which=4)
	plot(1:length(age),cooks.distance(res.mhr),xlab="Obs. number", ylab="Cook's distance",type="h")
	length(cooks.distance(res.mhr))
	cooks = cooks.distance(res.mhr)
	ind = which(cooks>0.08)
	text(ind,cooks[ind],labels=paste(ind),pos=3,cex=0.8)

###################### Assessing linearity - RESIDUAL PLOT ######################
x = rep(1:10,4)                             
y = 5*sin(x) + rnorm(40,sd=1)                 # the relation between x and y is sinusoidal trend
plot(x,y)
res = lm(y~x)   
names(res)          

old.par = par(no.readonly = TRUE)            
	par(fig=c(0, 0.5, 0.2, 0.8))                  
	plot(y~x); abline(res)                        
	curve(5*sin(x), lty=2, col=2, lwd=2, add=T)   
	par(fig=c(0.5, 1, 0.2, 0.8), new=TRUE)       
	plot(resid(res)~fitted(res))			#they should scatter around the horizontal line at zero
par(old.par)
plot(res,which=1)                             # it corresponds to the first plot

###################### Normality of the residual - NORMAL QQ PLOT ######################
x = rep(1:10,4)                               # this data set is created to fulfill the normality PREDICTOR
epsilon = rnorm(40,sd=1)                      # assumption but not the independence of the error 
y = 1+2*x+cumsum(epsilon)                     # In this case the error cumulates
	plot(x,y)
res = lm(y~x)
	e = resid(res)                                # residual
	std.e = (e-mean(e))/sd(e)                     # standardized residual ERROR CUMULATES OVER OBS
	qqnorm(std.e);abline(a=0,b=1,lty=2)           # qqplot to assess normality - INDEP NOT SATISF
plot(res,which=2)                             # it corresponds to the second plot

plot(e[-40],e[-1])                            # IMP - the lag plot should be scattered if the errors 
cor(e[-40],e[-1])                             # were uncorrelated

plot(acf(e))                                  # autocorrelation function, first bar is 1
                                              # second bar is cor(e[-40],e[-1]), 
                                              # third bar is cor(e[c(-40,.39)],e[c(-1,-2)]) and so on
                                              # they should be all small in absolute value 
							                  # INDEPENDENCE IF ALL VALUE AROUND 0

####################### THE ERROR TERMS MUST HAVE COMMON VARIANCE ######################
x = rep(1:10,4)       				    # PREDICTOR VARIABLE
y = 1+(1/2)*x+rnorm(40,sd = x/10)             # the error terms have variance proportional to the values of the predictor
plot(x,y)                                     # A DIFFERENT SD PER OGNI PUNTO
res = lm(y~x)
plot(resid(res)~fitted(res)); abline(h=0,lty=2) # the spread gets larger for larger values of x 

e = resid(res)                                # residuals
std.e = (e-mean(e))/sd(e)                     # standardized residuals
plot(sqrt(abs(std.e))~fitted(res))            # the spread of the points gets larger for larger values of hat.y 
	plot(res,which=3)                             # it corresponds to the third plot

####################### COOK'S DISTANCE PLOT (residuals vs leverage) ########################         
emissions<-read.table("emissions.txt",header=TRUE)
head(emissions, n=3)      # data on CO2 emissions and per-capita gross domestic product (GDP) for several countries
res = lm(CO2~perCapita, data=emissions)
plot(CO2~perCapita,data=emissions)
abline(res)
	attach(emissions)
identify(perCapita,CO2,n=2, labels=rownames(emissions))        
	detach(emissions)
	cooks.distance(res)    # Cook's distance, calculated as the difference of hat.y_i with and without the i-th observation
                             # high value of Cook's distance is signal of a influential point
plot(res,which=4)                             # diagnostic graph for influential points >1 INFLUENTIAL
n = length(emissions$CO2)

####################### Statistical Inferences - CREATE A VECTOR WITH THE ESTIMATES #################
n.simu = 10; sigma0 = 5                                     # standard error of the residual
hat.th = array(0,c(n.simu,3))      		# where we store hat.beta_0, hat.beta_1, (hat.sigma)^2
x = rep(1:10,10)  			 	# predictor values
plot(y~x); abline(res)
for(i in 2:n.simu) { 	y = x+rnorm(100,sd=sigma0)                   # simulate response values
  				res = lm(y~x)                                
  				hat.th[i,1:2] = coef(res)                    # coefficients
  				hat.th[i,3] = deviance(res)/(100-2)          # s^2 = RSS/df = unbiased estimate of sigma0^2
  				abline(res)                                  # add the i-th regression line
			}
titl = expression(paste("Simulated data  ",Y[i]==x[i]+epsilon[i]," ,  ",  sigma==5))
mtext(titl,side=3,line=1,cex=1.2)

old.par = par(no.readonly = TRUE)            
par(fig=c(0, 0.5, 0.2, 0.8)) 
	hist(hat.th[,1],prob=TRUE)  
	lines(density(hat.th[,1])) 
	SE_0 = sigma0*sqrt(sum(x^2)/ (length(x)*sum((x-mean(x))^2)))         			# Standard Error of hat.beta_0
	lines(sort(hat.th[,1]),dnorm(sort(hat.th[,1]),mean=0,sd=SE_0),lty=2,col=2,lwd=2)	# theoretical distribution of hat.beta_0
par(fig=c(0.5, 1, 0.2, 0.8), new=TRUE)        
	hist(hat.th[,2],prob=TRUE)              
	lines(density(hat.th[,2]))    
	SE_1 = sigma0*sqrt(1/sum((x-mean(x))^2))  
	lines(sort(hat.th[,2]),dnorm(sort(hat.th[,2]), mean=1,sd=SE_1),lty=2,col=2,lwd=2)# theoretical distribution of hat.beta_1

par(old.par) 
qqnorm(hat.th[,1],main="Normal QQplot beta_0"); qqline(hat.th[,1]) 
qqnorm(hat.th[,2],main="Normal QQplot beta_1")# normal Q-Q plot of hat.beta_1 ; qqline(hat.th[,2])
score = hat.th[,3]*(100-2)/sigma0^2     		# s^2 chis-quare distributed (after proper rescaling)
plot(qchisq(ppoints(score),100-2),sort(score), main="Chi-squared QQplot sigma.sq"); abline(0,1)                                   

# back to mhr data
age = rep(seq(20,60,by=5), 3)                
mhr = 209-0.7*age+rnorm(length(age),sd=4)     
res.mhr = lm(mhr~age)
	summary(res.mhr) 
n = length(age)
hat.sigma = sqrt( sum(resid(res.mhr)^2)/(n-2) )  # s^2 = RSS/df = unbiased estimate of sigma0^2 = pag18
	sqrt(deviance(res.mhr)/(n-2))              # the same RSS/(n-2)
	c(sigma=hat.sigma)
	summary(res.mhr)$sigma
	names(summary(res.mhr))
	unclass(summary(res.mhr))                     # gives 'summary()' as a list

                                              # the table named 'Coefficients' reports the
                                              # least squared estimates of bets_0 and beta_1,
                                              # the corresponding SE, the t-score and p-vale for
                                              # H_0: bata_i=0 vs beta_i!=0,  i=0,1

hat.beta_0 = coef(res.mhr)[1]  # Inference on beta_0
	summary(res.mhr)	
		SE_0 = hat.sigma* sqrt( sum(age^2)/(n*sum( (age-mean(age))^2)) ) # SE of hat.beta_0 by formula
			c(Estimate=hat.beta_0,Std._Error=SE_0);
		t_score = (hat.beta_0-0)/SE_0                  # H_0: beta_0=0 vs H_1: beta_0!=0
		p_value = 2*(1-pt(abs(t_score),df=n-2))        
			c(t_value=t_score,p=p_value)                  # intercept is significantly different from zero
		round(hat.beta_0+qt(c(0.025,0.975),n-2)*SE_0,2) # 95% confidence interval
	summary(res.mhr)$coef               
hat.beta_1 = coef(res.mhr)[2]  # Inference on beta_1
	summary(res.mhr)
		SE_1 = hat.sigma*sqrt( 1/sum((age-mean(age))^2 )) 
			c(Estimate=hat.beta_1,Std._Error=SE_1)
		t_score = (hat.beta_1-0)/SE_1    # H_0: beta_1=0 vs H_1: beta_1!=0
		p_value = 2*(1-pt(abs(t_score),df=n-2))  
			c(t_value=t_score,p=p_value)                 
		round(hat.beta_1+qt(c(0.025,0.975),n-2)*SE_1,2) 
	summary(res.mhr)$coef                        


####################### Anova and Coefficient of Determination (OPTIONAL) ###################
r.sq = cor(age,mhr)^2                          # equal to square of Pearson correlation coeff
c(R_Squared=r.sq)
summary(res.mhr)$r.squared                    
	TSS = sum( (mhr-mean(mhr))^2 )                 # Total Sum of Squares TSS
	RSS = sum( (mhr-fitted(res.mhr))^2 )           # Residual Sum of Squares RSS
	ESS = Reg.SS = sum( (fitted(res.mhr)-mean(mhr))^2 )  # Regression Sum of Squares ESS explained
	RSS; TSS-Reg.SS; deviance(res.mhr)
	R2 = ESS / TSS; R2; r.sq         # R2 Coefficient of Determination
anova(res.mhr)                                # Analysis of Variance
	F_score = Reg.SS/(deviance(res.mhr)/(n-2))     # H_0: beta_1=0 vs H_1: beta_1!=0 i.e. first row of anova(res.mhr) 
	t_score = (hat.beta_1-0)/SE_1                  # t-test statistic for the same hypothesis test

F_score; t_score^2                            # the same
1-pf(F_score,df1=1,df2=n-2)                   # p-value of the F-test for 
                                              #         H_0: beta_1=0 vs H_1: beta_1!=0
                                              # correponds to first row of anova(res.mhr)

2*(1-pt(abs(t_score),df=n-2))                 # the same p-value given by t-test